In [5]:
import pandas as pd
import numpy as np
from pyproj import Proj, transform
from pyproj import Transformer
import matplotlib.pyplot as plt
import folium
from IPython.display import display
from matplotlib import cm
import seaborn as sns
from matplotlib.ticker import PercentFormatter
from scipy.stats import ttest_ind
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
plt.style.use('ggplot')
In [6]:
# Reusable projector: NZTM2000 (EPSG:2193) -> WGS84 (EPSG:4326).
# Built once at module level because constructing a Transformer is expensive.
TRAN_2193_TO_4326 = Transformer.from_crs("EPSG:2193", "EPSG:4326")
def transform_coord(lat, lon):
    """Project NZTM2000 coordinates to WGS84 and return a (lat, lon) pair.

    NOTE(review): despite the parameter names, callers in this notebook pass
    the NZTM northing (Y) and easting (X); EPSG:2193's default axis order
    makes that the expected input here — confirm against the data source.
    Accepts scalars or array-likes (pandas Series are passed below).
    """
    converted = TRAN_2193_TO_4326.transform(lat, lon)
    return converted
In [7]:
raw_df = pd.read_csv("/Users/yasirmuhammad/Downloads/Crash_Analysis_System_(CAS)_data.csv")
raw_df.head(10)
Out[7]:
X Y OBJECTID advisorySpeed areaUnitID bicycle bridge bus carStationWagon cliffBank ... train tree truck unknownVehicleType urban vanOrUtility vehicle waterRiver weatherA weatherB
0 1406914.0 4915023.0 1 NaN 602800.0 0.0 NaN 0.0 1.0 NaN ... NaN NaN 0.0 0.0 Urban 1.0 NaN NaN Fine Null
1 1799023.0 5814728.0 2 NaN 528900.0 0.0 NaN 0.0 3.0 NaN ... NaN NaN 0.0 0.0 Urban 0.0 NaN NaN Fine Null
2 1741193.0 5976099.0 3 55.0 506660.0 0.0 0.0 0.0 1.0 1.0 ... 0.0 0.0 0.0 0.0 Open 0.0 0.0 0.0 Light rain Null
3 1790288.0 5837387.0 4 NaN 526601.0 0.0 0.0 0.0 1.0 1.0 ... 0.0 0.0 0.0 0.0 Open 0.0 0.0 0.0 Light rain Null
4 1615675.0 5423627.0 5 NaN 584201.0 0.0 NaN 0.0 2.0 NaN ... NaN NaN 0.0 0.0 Urban 0.0 NaN NaN Fine Null
5 1838932.0 5519398.0 6 NaN 577900.0 0.0 0.0 0.0 1.0 0.0 ... 0.0 0.0 0.0 0.0 Open 0.0 0.0 0.0 Heavy rain Null
6 1893341.0 5812746.0 7 NaN 538502.0 0.0 NaN 0.0 1.0 NaN ... NaN NaN 0.0 0.0 Open 1.0 NaN NaN Fine Null
7 1570406.0 5178315.0 8 NaN 594600.0 0.0 0.0 0.0 4.0 0.0 ... 0.0 0.0 0.0 0.0 Urban 0.0 0.0 0.0 Fine Null
8 1702855.0 5635351.0 9 NaN 553500.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 1.0 0.0 Open 0.0 0.0 0.0 Heavy rain Null
9 1766331.0 5907432.0 10 NaN 522302.0 0.0 0.0 0.0 1.0 0.0 ... 0.0 0.0 0.0 0.0 Open 0.0 0.0 0.0 Fine Null

10 rows × 72 columns

Data cleaning, transformation, and feature engineering

In [8]:
# Keep only the columns used in the analysis; .copy() detaches the frame
# from raw_df so the writes below don't hit a view (SettingWithCopyWarning).
df = raw_df[['X','Y','OBJECTID','bicycle','bus','carStationWagon','crashDirectionDescription','crashLocation1','crashSeverity','crashYear','fatalCount','flatHill','holiday',
'light','minorInjuryCount','moped','motorcycle','region','speedLimit','suv','taxi','train','truck','tree','weatherA']].copy()

# Count-style columns: NaN means "none recorded", so fill with 0.
cols = ['bicycle','bus','carStationWagon','fatalCount','minorInjuryCount','moped','motorcycle','suv','taxi','train','truck','tree']
df[cols] = df[cols].fillna(0)

# Missing holiday -> 'NPH' (presumably "no public holiday" — confirm).
df['holiday'] = df['holiday'].fillna('NPH')

# Project NZTM2000 (EPSG:2193) to WGS84 in a single pass instead of
# calling transform_coord twice. EPSG:2193's default axis order is
# (northing, easting), hence the (Y, X) argument order.
df['latitude'], df['longitude'] = transform_coord(df['Y'], df['X'])
df = df.drop(columns=['X', 'Y'])

# Total motorised vehicles involved in each crash.
df['vehicleInvolved'] = df['suv'] + df['taxi'] + df['carStationWagon'] + df['truck'] + df['bus']
# Coarse era bucket used by the t-tests later in the notebook.
df['Time Category'] = np.where(df['crashYear']<2010,"2000s","2010s")

Exploratory Data Analysis

In [9]:
# Annual crash counts over the whole observation window.
accidents_per_year = df.groupby('crashYear')['OBJECTID'].count()
accidents_per_year.plot(kind='bar', figsize=(12, 7), color='blue', alpha=0.5)
plt.title('No of accidents in last 21 years', fontsize=20)
plt.ylabel('Number of accidents', fontsize=16)
plt.show()

The number of accidents rose from the early 2000s to the mid/late 2000s, then declined steadily until about 2015. Over the last six years the overall number of accidents increased each year, although 2020 saw roughly 15% fewer accidents than 2019.

In [10]:
# Side-by-side yearly totals: fatalities (left) vs minor injuries (right).
# NOTE: this cell drives the pyplot state machine — each plt.subplot() call
# selects the current axes, and the pandas .plot() calls draw into whichever
# axes is current, so statement order matters throughout this cell.
plt.subplot(1,2,1)

# Left panel: total fatalities per crash year.
df.groupby('crashYear')['fatalCount'].sum().plot(kind='bar',figsize=(15,7), color='red', alpha=0.5)
plt.title('Fatalities in last 21 years',fontsize=20)
plt.ylabel('Fatalities',fontsize=16)

# Switch the current axes to the right-hand panel.
plt.subplot(1,2,2)
# Right panel: total minor injuries per crash year.
df.groupby('crashYear')['minorInjuryCount'].sum().plot(kind='bar',figsize=(15,7), color='blue', alpha=0.5)
plt.title('Injuries in last 21 years',fontsize=20)
plt.ylabel('No of Injuries',fontsize=16)

plt.show()
In [11]:
# Mean fatalities per fatal crash by era, plus Welch's t-test (unequal
# variances) on the two samples. Filter once and reuse the subsets.
fatal_2000s = df.loc[(df['Time Category'] == '2000s') & (df['crashSeverity'] == 'Fatal Crash'), 'fatalCount']
fatal_2010s = df.loc[(df['Time Category'] == '2010s') & (df['crashSeverity'] == 'Fatal Crash'), 'fatalCount']

mean_2000s = fatal_2000s.mean()
mean_2010s = fatal_2010s.mean()
t_result = ttest_ind(fatal_2000s, fatal_2010s, equal_var=False)

print(f"The mean of 2000s era is {mean_2000s}, mean of 2010s era is {mean_2010s}, and t-test is {t_result}")
The mean of 2000s era is 1.144927536231884, mean of 2010s era is 1.1157449380477487, and t-test is Ttest_indResult(statistic=2.642128292866719, pvalue=0.00825710861367675)
In [12]:
# Same comparison for minor injuries, restricted to crashes that actually
# recorded at least one injury. Filter once and reuse the subsets.
inj_2000s = df.loc[(df['Time Category'] == '2000s') & (df['minorInjuryCount'] != 0), 'minorInjuryCount']
inj_2010s = df.loc[(df['Time Category'] == '2010s') & (df['minorInjuryCount'] != 0), 'minorInjuryCount']

mean_2000s_inj = inj_2000s.mean()
mean_2010s_inj = inj_2010s.mean()
t_result_inj = ttest_ind(inj_2000s, inj_2010s, equal_var=False)

print(f"The mean of 2000s era is {mean_2000s_inj}, mean of 2010s era is {mean_2010s_inj}, and t-test is {t_result_inj}")
The mean of 2000s era is 1.312814465408805, mean of 2010s era is 1.239496347210917, and t-test is Ttest_indResult(statistic=23.721515653412073, pvalue=3.375856517203661e-124)

The p-value is less than 0.05, so the difference is statistically significant.

In [16]:
# Plot every fatal crash that recorded at least one fatality on a map of NZ.
fatal_crashes = df[df['crashSeverity'] == 'Fatal Crash']
nz_map = folium.Map(location=[-40.9006, 174.8860], zoom_start=6)

for crash in fatal_crashes.itertuples():
    # Popup shows the fatality count for the clicked marker.
    label = str(crash.fatalCount)
    if label == '0':
        continue
    folium.features.CircleMarker(
        [crash.latitude, crash.longitude],
        radius=3,
        color='red',
        fill=True,
        popup=label,
        fill_color='darkred',
        fill_opacity=0.6
    ).add_to(nz_map)
nz_map
Out[16]:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [31]:
# 16 evenly spaced colours from the 'Paired' colormap — one per region
# (assumes the data contains exactly 16 regions; TODO confirm).
cs = cm.Paired(np.arange(16)/16.)
plt.subplot(1,2,1)
# Explode offsets are positional: they line up with the (alphabetical)
# region order produced by groupby, so they are fragile if the region
# set or its ordering ever changes.
explode1 = (0.1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.2)
explode2 = (0,0,0,0,0,0,0,0.2,0,0,0,0,0,0.1,0,0)

# Left pie: each region's share of all accidents.
df.groupby('region')['OBJECTID'].count().plot(kind='pie',figsize=(12,7),autopct='%1.1f%%',labeldistance=None,explode=explode1,colors=cs)
plt.ylabel('')
plt.subplot(1,2,2)

# Right pie: each region's share of all fatalities.
df.groupby('region')['fatalCount'].sum().plot(kind='pie',figsize=(15,7),autopct='%1.1f%%',labeldistance=None,explode=explode2,colors=cs)
plt.ylabel('')
# Single legend with the region names, shared by both pies.
plt.legend(loc=(0.5,1),ncol=2)
plt.suptitle("No of accidents & fatalities in last 21 years",size=20)
plt.show()

The Auckland region has the highest proportion of accidents of any region. However, the Waikato region has the highest proportion of fatalities (19.6%) over the last 21 years, suggesting that crash severity is highest in Waikato.

In [19]:
# Pareto chart: accidents per region (bars) with the cumulative share (line).
# Select the groupby columns with a list — the original tuple selection
# (['OBJECTID','fatalCount'] without inner brackets) was deprecated and then
# removed in pandas 2.0.
pareto_df = (
    df.groupby('region')[['OBJECTID', 'fatalCount']]
      .agg({'OBJECTID': 'count', 'fatalCount': 'sum'})
      .reset_index()
      .rename(columns={'OBJECTID': 'accidents'})
)
pareto_df.sort_values(['accidents'], ascending=False, inplace=True)
pareto_df.reset_index(inplace=True)
# Cumulative percentage of all accidents, in descending-region order.
pareto_df['%CumSum accidents'] = (100 * pareto_df['accidents'].cumsum()/pareto_df['accidents'].sum())

fig, ax1 = plt.subplots()
ax1.bar(pareto_df['region'], pareto_df['accidents'], color="C0")
ax1.set_ylabel('No of accidents', color="C0")
ax1.set_xlabel("Regions")
# Rotate the existing category labels; calling set_xticklabels() without
# set_xticks() first triggers a matplotlib warning and can mislabel bars.
ax1.tick_params(axis='x', rotation=90)
ax2 = ax1.twinx()
ax2.plot(pareto_df['region'], pareto_df['%CumSum accidents'], color="C1", marker="D", ms=7)
ax2.yaxis.set_major_formatter(PercentFormatter())
ax2.tick_params(axis="y", colors="C1")
plt.title("Pareto Chart- Accidents")
Out[19]:
Text(0.5, 1.0, 'Pareto Chart- Accidents')
In [20]:
# Pareto chart: fatalities per region. Re-sorts pareto_df built in the
# previous cell (this cell mutates it in place, so cell order matters).
pareto_df.sort_values(['fatalCount'], ascending=False, inplace=True)
pareto_df.reset_index(inplace=True, drop=True)
# Cumulative percentage of all fatalities, in descending-region order.
pareto_df['%CumSum fatalcount'] = (100 * pareto_df['fatalCount'].cumsum()/pareto_df['fatalCount'].sum())

fig, ax1 = plt.subplots()
ax1.bar(pareto_df['region'], pareto_df['fatalCount'], color="C0")
ax1.set_ylabel('fatalCount', color="C0")
ax1.set_xlabel("Regions")
# Rotate the existing category labels; calling set_xticklabels() without
# set_xticks() first triggers a matplotlib warning and can mislabel bars.
ax1.tick_params(axis='x', rotation=90)
ax2 = ax1.twinx()
ax2.plot(pareto_df['region'], pareto_df['%CumSum fatalcount'], color="C1", marker="D", ms=7)
ax2.yaxis.set_major_formatter(PercentFormatter())
ax2.tick_params(axis="y", colors="C1")
plt.title("Pareto Chart- Fatalities")
Out[20]:
Text(0.5, 1.0, 'Pareto Chart- Fatalities')
In [21]:
# Fatalities per region per year, one bar panel per region.
# Select the groupby columns with a list — tuple selection was deprecated
# and removed in pandas 2.0.
regions_acc = (
    df.groupby(['crashYear', 'region'])[['OBJECTID', 'fatalCount']]
      .agg({'OBJECTID': 'count', 'fatalCount': 'sum'})
      .reset_index()
)
# catplot returns a FacetGrid (not an Axes), hence the name.
grid = sns.catplot(data=regions_acc, kind='bar', y='fatalCount', x='crashYear', col_wrap=4, col='region')
grid.set_xticklabels(rotation=60)
Out[21]:
<seaborn.axisgrid.FacetGrid at 0x7fd997f84240>
In [22]:
# Distribution of crashes by number of vehicles involved, plus the shares
# used in the narrative below.
vehicles_involved = df['vehicleInvolved'].value_counts().sort_index()

# Share of crashes involving at least one vehicle.
total_crashes = vehicles_involved.sum()
prct_vehcile_inv = 1 - (vehicles_involved[0] / total_crashes)

# Among vehicle-involved crashes, the one- and two-vehicle shares.
with_vehicle_total = vehicles_involved[1:].sum()
prct_one_vehcile_inv = vehicles_involved[1] / with_vehicle_total
prct_two_vehcile_inv = vehicles_involved[2] / with_vehicle_total
In [23]:
# Drop the zero-vehicle bucket and plot the remaining distribution.
vehicles_involved = vehicles_involved[1:]
vehicles_involved.plot(kind='bar', figsize=(12, 7), color='darkblue', alpha=0.5)
plt.title('Accidents in last 21 years', fontsize=20)
plt.xlabel('Vehicles involved', fontsize=16)
plt.ylabel('Number of accidents', fontsize=16)

# Annotate each bar with its count. After the slice the index starts at 1
# while the bars sit at x = 0, 1, ..., hence the idx - 1 offset.
for idx, count in vehicles_involved.items():
    plt.text(x=idx - 1, y=count, s=str(count), horizontalalignment='center')
plt.show()

Around 94% of the accidents that occurred in the last 21 years involved at least one vehicle. Of those, about 48% involved two vehicles and about 45% involved a single vehicle.

In [24]:
# Total fatalities split by one- vs two-vehicle crashes.
one_or_two = (df['vehicleInvolved'] > 0) & (df['vehicleInvolved'] <= 2)
fatal_by_vehicles = df[one_or_two].groupby('vehicleInvolved')['fatalCount'].sum()
fatal_by_vehicles.plot(kind='bar')
plt.title("Distribution of fatalities w.r.t vehcile involved")

There were about 88% more fatalities in single-vehicle accidents than in accidents involving two vehicles.

In [25]:
# Fatalities by light condition, split by one- vs two-vehicle crashes.
one_or_two_mask = (df['vehicleInvolved'] > 0) & (df['vehicleInvolved'] <= 2)
vehicle_type_light_count = (
    df[one_or_two_mask]
    .groupby(['vehicleInvolved', 'light'])['fatalCount']
    .sum()
    .reset_index()
)
# Map the numeric vehicle counts to the legend labels.
vehicle_type_light_count['vehicleInvolved'] = vehicle_type_light_count['vehicleInvolved'].replace({1.0: "One Vehcile", 2.0: "Two Vehciles"})

sns.catplot(data=vehicle_type_light_count, x='light', y='fatalCount', kind='bar', orient='v', hue='vehicleInvolved')
plt.title('Number of Fatalities')
Out[25]:
Text(0.5, 1.0, 'Number of Fatalities')
In [26]:
# Kruskal-Wallis H test across light conditions, restricted to
# single-vehicle fatal crashes. Filter once, then split by light.
single_fatal = df[(df['crashSeverity'] == 'Fatal Crash') & (df['vehicleInvolved'] == 1)]

BS = single_fatal.loc[single_fatal['light'] == 'Bright sun', 'fatalCount'].values
DK = single_fatal.loc[single_fatal['light'] == 'Dark', 'fatalCount'].values
OC = single_fatal.loc[single_fatal['light'] == 'Overcast', 'fatalCount'].values

K_result = stats.kruskal(BS, DK, OC)
print(f"Kruskal-Wallis H test result is {K_result} and it rejects the null hypothesis, i.e. there is significant difference among groups")
Kruskal-Wallis H test result is KruskalResult(statistic=6.011805764282692, pvalue=0.04949404685768179) and it rejects the null hypothesis, i.e. there is significant difference among groups
In [27]:
# Side-by-side totals by light condition: fatalities (left) vs minor
# injuries (right). NOTE: this cell drives the pyplot state machine —
# plt.subplot() selects the current axes and the pandas .plot() calls draw
# into whichever axes is current, so statement order matters here.
plt.subplot(1,2,1)

# Left panel: total fatalities per light condition.
df.groupby('light')['fatalCount'].sum().plot(kind='bar',figsize=(15,7), color='red', alpha=0.5)
plt.title('Fatalities in last 21 years',fontsize=20)
plt.ylabel('Fatalities',fontsize=16)

# Switch the current axes to the right-hand panel.
plt.subplot(1,2,2)
# Right panel: total minor injuries per light condition.
df.groupby('light')['minorInjuryCount'].sum().plot(kind='bar',figsize=(15,7), color='blue', alpha=0.5)
plt.title('Injuries in last 21 years',fontsize=20)
plt.ylabel('No of Injuries',fontsize=16)

plt.show()
In [28]:
# Kruskal-Wallis H test across four light conditions for all fatal crashes.
# The conclusion is now derived from the p-value instead of being hard-coded
# into the message (the original always printed "holds the null hypothesis",
# which is also statistically imprecise — we "fail to reject", never "hold").
fatal_df = df[df['crashSeverity'] == 'Fatal Crash']
BS = fatal_df.loc[fatal_df['light'] == 'Bright sun', 'fatalCount'].values
DK = fatal_df.loc[fatal_df['light'] == 'Dark', 'fatalCount'].values
OC = fatal_df.loc[fatal_df['light'] == 'Overcast', 'fatalCount'].values
TL = fatal_df.loc[fatal_df['light'] == 'Twilight', 'fatalCount'].values

K_result = stats.kruskal(BS, DK, OC, TL)
# Verdict at the conventional 5% significance level.
if K_result.pvalue < 0.05:
    verdict = "rejects the null hypothesis, i.e. there is significant difference among groups"
else:
    verdict = "fails to reject the null hypothesis, i.e. there is no significant difference among groups"
print(f"Kruskal-Wallis H test result is {K_result} and it {verdict}")